Code
# Basic scatterplot: engine displacement (displ) against highway mileage (hwy).
library(ggplot2)
data(mpg) # mpg ships with ggplot2; data() makes it available in the workspace

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point()
MSDA - Bootcamp 2025 Summer
KT Wong
August 1, 2025
It starts from the grammar of graphics (Wickham, 2016)
The mpg dataset is a tibble, a modern version of a data frame
The mpg dataset is part of the ggplot2 package
The mpg dataset is a tidy dataset
This dataset suggests many interesting questions
List five functions that you could use to get more information about the mpg dataset
How can you find out what other datasets are included with ggplot2?
Apart from the US, most countries use fuel consumption (fuel consumed over fixed distance) rather than fuel economy (distance travelled with fixed amount of fuel). How could you convert cty and hwy into the European standard of l/100km?
Which manufacturer has the most models in this dataset?
. . .
. . .
# Styled displ-vs-hwy scatterplot: points coloured by vehicle class, with the
# Economist theme and the Tableau colour palette from ggthemes.
library(ggthemes)

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class)) +
  scale_color_tableau() +
  theme_economist() +
  labs(
    title = "Relationship between engine size and fuel economy",
    x = "Engine size (litres)",
    y = "Highway fuel economy (miles per gallon)",
    color = "Car type",
    caption = "Source: mpg dataset"
  )
# A tibble: 6 × 8
partner year partner_name product product_name US_report_import pop2000
<chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 ARE 1998 United Arab Emira… 950341 "Toys repre… 1.06 3.25e6
2 ARE 2000 United Arab Emira… 950349 "Toys repre… 12.0 3.25e6
3 ARE 2003 United Arab Emira… 950349 "Toys repre… 4.65 3.25e6
4 ARE 2005 United Arab Emira… 950320 "Reduced-si… 49.2 3.25e6
5 ARG 1996 Argentina 950341 "Toys repre… 0 3.69e7
6 ARG 1996 Argentina 950310 "Electric t… 10.8 3.69e7
# ℹ 1 more variable: region <dbl>
. . .
# A tibble: 5 × 2
partner_name total_import
<chr> <dbl>
1 China 26842305.
2 Denmark 1034990.
3 Canada 572309.
4 Hong Kong, China 545186.
5 Switzerland 400969.
Each row gives the total dollar value of toys imported to the U.S. (US_report_import, in multiples of $1,000) in a specific product category from a specific country in a specific year
The product categories have unique numerical codes (product) as well as product names exciting enough to quicken the heart of any toy-loving child (“Parts and accessories :– Other,” “Toys representing animal or non-human figures,” and so on).
Group all the observations by trading partner (the partner_name variable)
For each partner, calculate total dollar value by summing toy imports (US_report_import) across all categories and years
Arrange the partners by total dollar value
#| out-width: 100%
# Interactive line chart of yearly U.S. toy imports from five trading
# partners, drawn on a log10 y-axis and converted to plotly at the end.
# NOTE(review): `toy_imports` and the dplyr verbs are assumed to be in scope
# from an earlier chunk that is not visible here -- confirm.
library(ggthemes)
library(scales)
library(plotly)

# Large penalty on scientific notation so numbers print in fixed notation.
options(scipen = 999)

top5_partners <- c(
  "China", "Denmark", "Canada", "Hong Kong, China", "Switzerland"
)

p <- toy_imports %>%
  filter(partner_name %in% top5_partners) %>%
  group_by(year, partner_name) %>%
  # One total per (year, partner); drop the grouping so nothing downstream
  # inherits it and summarize() does not emit its regrouping message.
  summarize(total_import = sum(US_report_import), .groups = "drop") %>%
  ggplot(aes(year, total_import, color = partner_name)) +
  geom_line() +
  labs(
    title = "Toy imports from the U.S.'s top-5 partners, 1996-2005",
    x = "Year",
    # US_report_import is recorded in multiples of $1,000, so the axis is in
    # thousands of dollars rather than dollars.
    y = "Value of imports in thousands of dollars (log scale)",
    color = "Import Region"
  ) +
  scale_x_continuous(breaks = 1996:2005) +
  theme_economist() +
  # Breaks at powers of ten, labelled as 10^k.
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))
  )

ggplotly(p)
library(tidyverse)
# Read the Rapid City temperature data, then list the five (Year, Month)
# combinations with the lowest mean Temp, together with each month's minimum
# and maximum Temp. All columns are rounded to one decimal place.
rapidcity <- read_csv(
  "https://raw.githubusercontent.com/kwan-MSDA/Bootcamp_2024/main/dataset/rapidcity.csv"
)

rapidcity %>%
  group_by(Year, Month) %>%
  summarise(
    avg_Temp = mean(Temp),
    lowest_temp = min(Temp),
    hightest_temp = max(Temp)
  ) %>%
  arrange(avg_Temp) %>%
  head(n = 5) %>%
  round(digits = 1)
# A tibble: 5 × 5
# Groups: Year [4]
Year Month avg_Temp lowest_temp hightest_temp
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1996 1 14.9 -11 46.1
2 2009 12 16.4 -2.6 35.6
3 2000 12 17.3 -9 38.8
4 1996 12 17.5 -10.8 40.4
5 2001 2 17.6 -3.9 40.8
Q: how did survival among adult passengers vary by sex and cabin class?
# A tibble: 6 × 5
name survived sex age passengerClass
<chr> <chr> <chr> <dbl> <chr>
1 Allen, Miss. Elisabeth Walton yes female 29 1st
2 Allison, Master. Hudson Trevor yes male 0.917 1st
3 Allison, Miss. Helen Loraine no female 2 1st
4 Allison, Mr. Hudson Joshua Crei no male 30 1st
5 Allison, Mrs. Hudson J C (Bessi no female 25 1st
6 Anderson, Mr. Harry yes male 48 1st
# A tibble: 6 × 5
# Groups: sex [2]
sex passengerClass total_count survived survival_rate
<chr> <chr> <int> <int> <dbl>
1 female 1st 125 121 0.968
2 female 2nd 85 74 0.871
3 female 3rd 106 47 0.443
4 male 1st 144 47 0.326
5 male 2nd 143 12 0.0839
6 male 3rd 289 45 0.156
# install.packages('devtools')
# devtools::install_github('bbc/bbplot')
library(ggpubr)
# The gapminder data set is used below, so load the package before first use.
library(gapminder)
# NOTE(review): bbc_style() is presumably defined by this sourced script --
# confirm against the remote file.
source("https://raw.githubusercontent.com/kwan-MSDA/R/main/bbc_style.R")

# Median life expectancy per continent and year, drawn as one line per
# continent.
gapminder %>%
  group_by(year, continent) %>%
  # Drop the grouping after summarising; the plot only needs the flat table.
  summarize(median_lifeExp = median(lifeExp), .groups = "drop") %>%
  ggplot(aes(year, median_lifeExp, color = continent)) +
  geom_line() +
  labs(
    title = "Life expectancy by continent and year",
    x = "Year",
    y = "Life expectancy"
  ) +
  bbc_style()
library("ggalt")
library("tidyr")
library(gapminder)

# Ten countries with the largest increase in life expectancy between 1967 and
# 2007, one row per country with the two years as columns.
dumbbell_df <- gapminder %>%
  filter(year %in% c(1967, 2007)) %>%
  select(country, year, lifeExp) %>%
  # pivot_wider() replaces the superseded spread(); it yields the same
  # `1967` / `2007` columns.
  pivot_wider(names_from = year, values_from = lifeExp) %>%
  mutate(gap = `2007` - `1967`) %>%
  arrange(desc(gap)) %>%
  head(10)

# Make plot: one dumbbell per country running from the 1967 value to the 2007
# value, with countries ordered on the y-axis by the size of the gap.
ggplot(
  dumbbell_df,
  aes(x = `1967`, xend = `2007`, y = reorder(country, gap), group = country)
) +
  geom_dumbbell(
    colour = "#dddddd",
    size = 3,
    colour_x = "#FAAB18",
    colour_xend = "#1380A1"
  ) +
  bbc_style() +
  labs(
    title = "We're living longer",
    subtitle = "Biggest life expectancy rise, 1967-2007"
  )
library(hrbrthemes)
library(viridis)

# Bubble chart for 2007: GDP per capita against life expectancy, bubble size
# mapped to population and fill mapped to continent.
gapminder %>%
  filter(year == 2007) %>%
  mutate(country = factor(country, levels = unique(country))) %>%
  # Rows are sorted from largest to smallest population before plotting
  # (presumably so small bubbles are not hidden under large ones).
  arrange(desc(pop)) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp, size = pop, fill = continent)) +
  geom_point(alpha = 0.6, shape = 21, color = "black") +
  scale_size(range = c(.1, 24), name = "Population (M)") +
  # guide = FALSE is deprecated in ggplot2; "none" is the supported spelling.
  scale_fill_viridis(discrete = TRUE, guide = "none", option = "A") +
  theme_ipsum() +
  theme(legend.position = "none") +
  labs(
    title = "Life expectancy by continent in 2007",
    x = "GDP per capita",
    y = "Life Expectancy"
  )
library(gganimate)
# Animated bubble chart: the same GDP-per-capita / life-expectancy view, with
# gganimate stepping through `year` and fading points in and out.
gapminder %>%
  # The animation is driven by transition_time(year) below, so the legacy
  # `frame = year` aesthetic is not needed and has been dropped.
  ggplot(aes(x = gdpPercap, y = lifeExp, size = pop, fill = continent)) +
  geom_point(alpha = 0.6, shape = 21, color = "black") +
  scale_size(range = c(.1, 22), name = "Population (M)") +
  # guide = FALSE is deprecated in ggplot2; "none" is the supported spelling.
  scale_fill_viridis(discrete = TRUE, guide = "none", option = "A") +
  theme_ipsum() +
  theme(legend.position = "none") +
  labs(
    title = "Life expectancy by continent in {frame_time}",
    x = "GDP per capita",
    y = "Life Expectancy"
  ) +
  # Label only the rows whose population exceeds 100 million; the fixed
  # size = 5 overrides the inherited size = pop mapping for the text.
  geom_text(
    data = gapminder %>% filter(pop > 1e+8),
    aes(label = country),
    size = 5,
    nudge_x = 0.1,
    nudge_y = 0.1
  ) +
  transition_time(year) +
  enter_fade() +
  exit_fade()
library(plotly)
library(hrbrthemes)
library(viridis)

# Interactive bubble chart built with plotly. The data are wrapped in a
# crosstalk SharedData object keyed by continent.
g <- crosstalk::SharedData$new(
  gapminder %>%
    mutate(country = factor(country, levels = unique(country))) %>%
    arrange(desc(pop)),
  ~continent
)

gg <- g %>%
  # `frame` and `ids` are not ggplot2 aesthetics.
  # NOTE(review): they are presumably consumed by ggplotly() for the animation
  # frames and for tracking points across them -- confirm.
  ggplot(aes(x = gdpPercap, y = lifeExp, fill = continent, frame = year)) +
  # alpha is mapped inside aes(), so it goes through scale_alpha() below
  # rather than being used as a literal 0.6.
  geom_point(aes(size = pop, alpha = 0.6, ids = country)) +
  scale_size(range = c(.1, 24), name = "Population (M)") +
  # guide = FALSE is deprecated in ggplot2; "none" is the supported spelling.
  scale_fill_viridis(discrete = TRUE, guide = "none", option = "A") +
  scale_alpha(range = c(0.6, 1), guide = "none") +
  theme_ipsum() +
  # theme(legend.position="none")+
  labs(
    title = "Life expectancy by continent between 1952-2007",
    x = "GDP per capita",
    y = "Life Expectancy"
  )

ggplotly(gg, height = 500, width = 800)